Student Performance
Dataset link: https://archive.ics.uci.edu/dataset/320/student+performance
About The Data
The datasets student-mat.csv and student-por.csv contain information about students enrolled in Math and Portuguese language courses. The datasets include various attributes related to student demographics, academic background, and social factors.
Key attributes include:
School: The student’s school (“GP” for Gabriel Pereira, “MS” for Mousinho da Silveira).
Sex: Gender (“F” for female, “M” for male).
Age: Age (numeric, 15–22).
Family Background: Attributes like family size, parents’ education, and cohabitation status.
Parental Jobs: Occupation of the mother and father.
Study and Activity Attributes: Study time, extracurricular activities, school attendance, and romantic relationships.
Health & Social Behaviors: Alcohol consumption, health status, family relationships, and free time.
Grades: G1, G2, and G3 represent the first period, second period, and final grades (numeric, from 0 to 20).
# install.packages("ucimlrepo")
# install.packages("dplyr")
# install.packages("mlbench")
# install.packages("fastDummies")
# install.packages("caret")
library(ucimlrepo)
library(dplyr)
library(mlbench)
# Fetch the UCI "Student Performance" dataset (id = 320) via ucimlrepo.
# NOTE(review): this requires network access to archive.ics.uci.edu.
student_performance <- fetch_ucirepo(id = 320)
# Separate the fetched object into predictors (X) and target columns (Y)
X <- student_performance$data$features
Y <- student_performance$data$targets
# Recombine features and targets into one data frame for the analysis below
data <- cbind(X,Y)
head(data)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## 1 GP F 18 U GT3 A 4 4 at_home teacher course
## 2 GP F 17 U GT3 T 1 1 at_home other course
## 3 GP F 15 U LE3 T 1 1 at_home other other
## 4 GP F 15 U GT3 T 4 2 health services home
## 5 GP F 16 U GT3 T 3 3 other other home
## 6 GP M 16 U LE3 T 4 3 services other reputation
## guardian traveltime studytime failures schoolsup famsup paid activities
## 1 mother 2 2 0 yes no no no
## 2 father 1 2 0 no yes no no
## 3 mother 1 2 0 yes no no no
## 4 mother 1 3 0 no yes no yes
## 5 father 1 2 0 no yes no no
## 6 mother 1 2 0 no yes no yes
## nursery higher internet romantic famrel freetime goout Dalc Walc health
## 1 yes yes no no 4 3 4 1 1 3
## 2 no yes yes no 5 3 3 1 1 3
## 3 yes yes yes no 4 3 2 2 3 3
## 4 yes yes yes yes 3 2 2 1 1 5
## 5 yes yes no no 4 3 2 1 2 5
## 6 yes yes yes no 5 4 2 1 2 5
## absences G1 G2 G3
## 1 4 0 11 11
## 2 2 9 11 11
## 3 6 12 13 12
## 4 0 14 14 14
## 5 0 11 13 13
## 6 6 12 12 13
# Inspect the structure: 649 rows, 33 columns, a mix of chr and int variables
str(data)
## 'data.frame': 649 obs. of 33 variables:
## $ school : chr "GP" "GP" "GP" "GP" ...
## $ sex : chr "F" "F" "F" "F" ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : chr "U" "U" "U" "U" ...
## $ famsize : chr "GT3" "GT3" "LE3" "GT3" ...
## $ Pstatus : chr "A" "T" "T" "T" ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : chr "at_home" "at_home" "at_home" "health" ...
## $ Fjob : chr "teacher" "other" "other" "services" ...
## $ reason : chr "course" "course" "other" "home" ...
## $ guardian : chr "mother" "father" "mother" "mother" ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 0 0 0 0 0 0 0 0 ...
## $ schoolsup : chr "yes" "no" "yes" "no" ...
## $ famsup : chr "no" "yes" "no" "yes" ...
## $ paid : chr "no" "no" "no" "no" ...
## $ activities: chr "no" "no" "no" "yes" ...
## $ nursery : chr "yes" "no" "yes" "yes" ...
## $ higher : chr "yes" "yes" "yes" "yes" ...
## $ internet : chr "no" "yes" "yes" "yes" ...
## $ romantic : chr "no" "no" "no" "yes" ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 4 2 6 0 0 6 0 2 0 0 ...
## $ G1 : int 0 9 12 14 11 12 13 10 15 12 ...
## $ G2 : int 11 11 13 14 13 12 12 13 16 12 ...
## $ G3 : int 11 11 12 14 13 13 13 13 17 13 ...
# Check whether there are any missing values (0 means the data is complete)
sum(is.na(data))
## [1] 0
library(dplyr)
# Convert every categorical (character) column to a factor so modelling
# functions handle them correctly. `address` (U/R) was previously omitted
# from this list by mistake (str() showed it remaining chr); it is now
# included for consistency with the other categorical columns.
data <- data %>%
mutate(across(c(school, sex, address, famsize, Pstatus, Mjob, Fjob, reason, guardian, schoolsup, famsup, paid, activities, nursery, higher, internet, romantic), as.factor))
# Re-inspect: the converted columns now show as Factor
str(data)
## 'data.frame': 649 obs. of 33 variables:
## $ school : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : chr "U" "U" "U" "U" ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 0 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 4 2 6 0 0 6 0 2 0 0 ...
## $ G1 : int 0 9 12 14 11 12 13 10 15 12 ...
## $ G2 : int 11 11 13 14 13 12 12 13 16 12 ...
## $ G3 : int 11 11 12 14 13 13 13 13 17 13 ...
library(dplyr)
# Pearson correlations among all numeric attributes.
numeric_vars <- c(
  "age", "Medu", "Fedu", "traveltime", "failures", "studytime",
  "famrel", "freetime", "goout", "Dalc", "Walc", "health", "absences",
  "G1", "G2", "G3"
)
corr_matrix <- cor(data[, numeric_vars])
# Show only the correlation of each variable with G3
corr_with_G3 <- corr_matrix[, "G3"]
print(corr_with_G3)
## age Medu Fedu traveltime failures studytime
## -0.10650539 0.24015076 0.21179968 -0.12717297 -0.39331555 0.24978869
## famrel freetime goout Dalc Walc health
## 0.06336113 -0.12270493 -0.08764072 -0.20471940 -0.17661887 -0.09885124
## absences G1 G2 G3
## -0.09137906 0.82638712 0.91854800 1.00000000
## corrplot 0.95 loaded
Decision rule for the ANOVA tests below: p ≤ 0.05 → the result is statistically significant (the factor is associated with G3) ✅ Yes;
p > 0.05 → the result is not statistically significant ❌ No.
# One-way ANOVA of the final grade G3 against each categorical predictor.
# A factor with p <= 0.05 is treated as significantly associated with G3
# and kept for modelling; the rest are dropped later.
# Analysis of G3 and School
summary(aov(G3 ~ school, data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## school 1 547 546.6 56.89 1.57e-13 ***
## Residuals 647 6217 9.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and famsize
summary(aov(G3 ~ famsize , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## famsize 1 14 13.71 1.314 0.252
## Residuals 647 6750 10.43
# Analysis of G3 and Pstatus (comment previously mislabelled as famsize)
summary(aov(G3 ~ Pstatus , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## Pstatus 1 0 0.004 0 0.985
## Residuals 647 6763 10.453
# Analysis of G3 and sex
summary(aov(G3 ~ sex, data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## sex 1 113 112.68 10.96 0.000982 ***
## Residuals 647 6651 10.28
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and Mjob
summary(aov(G3 ~ Mjob , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## Mjob 4 296 74.01 7.37 8.31e-06 ***
## Residuals 644 6467 10.04
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and Fjob
summary(aov(G3 ~ Fjob , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## Fjob 4 135 33.68 3.273 0.0114 *
## Residuals 644 6629 10.29
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and reason
summary(aov(G3 ~ reason , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## reason 3 308 102.57 10.25 1.34e-06 ***
## Residuals 645 6456 10.01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and schoolsup
summary(aov(G3 ~ schoolsup , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## schoolsup 1 30 29.82 2.866 0.091 .
## Residuals 647 6733 10.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and famsup (comment previously misspelled "fampsu")
summary(aov(G3 ~ famsup , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## famsup 1 24 23.71 2.276 0.132
## Residuals 647 6740 10.42
# Analysis of G3 and paid
summary(aov(G3 ~ paid, data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## paid 1 20 20.38 1.956 0.162
## Residuals 647 6743 10.42
# Analysis of G3 and activities
summary(aov(G3 ~ activities , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## activities 1 24 24.18 2.321 0.128
## Residuals 647 6739 10.42
# Analysis of G3 and nursery
summary(aov(G3 ~ nursery , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## nursery 1 6 5.591 0.535 0.465
## Residuals 647 6758 10.445
# Analysis of G3 and higher
summary(aov(G3 ~ higher , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## higher 1 746 746.2 80.24 <2e-16 ***
## Residuals 647 6017 9.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and internet
summary(aov(G3 ~ internet , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## internet 1 152 152.22 14.9 0.000125 ***
## Residuals 647 6611 10.22
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of G3 and romantic
summary(aov(G3 ~ romantic , data = data))
## Df Sum Sq Mean Sq F value Pr(>F)
## romantic 1 55 55.49 5.353 0.021 *
## Residuals 647 6708 10.37
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggplot2)
# Boxplots of the final grade G3 across the categorical predictors that the
# ANOVA above flagged as significant.
# G3 by school
ggplot(data, aes(x = school, y = G3, fill = school)) +
  geom_boxplot()
# G3 by sex
ggplot(data, aes(x = sex, y = G3, fill = sex)) +
  geom_boxplot()
# G3 by mother's job
ggplot(data, aes(x = Mjob, y = G3, fill = Mjob)) +
  geom_boxplot()
# G3 by father's job
ggplot(data, aes(x = Fjob, y = G3, fill = Fjob)) +
  geom_boxplot()
# G3 by reason for choosing the school
ggplot(data, aes(x = reason, y = G3, fill = reason)) +
  geom_boxplot()
# G3 by intention to pursue higher education
ggplot(data, aes(x = higher, y = G3, fill = higher)) +
  geom_boxplot()
# G3 by internet access at home
ggplot(data, aes(x = internet, y = G3, fill = internet)) +
  geom_boxplot()
# G3 by romantic relationship status
ggplot(data, aes(x = romantic, y = G3, fill = romantic)) +
  geom_boxplot()
library(fastDummies)
library(dplyr)
# One-hot encode the significant binary predictors; remove_first_dummy drops
# one level per variable to avoid perfect collinearity, and the original
# columns are removed.
encoded_data <- dummy_cols(data , select_columns = c("internet","higher","romantic"),remove_selected_columns = TRUE,remove_first_dummy = TRUE)
colnames(encoded_data)
## [1] "school" "sex" "age" "address" "famsize"
## [6] "Pstatus" "Medu" "Fedu" "Mjob" "Fjob"
## [11] "reason" "guardian" "traveltime" "studytime" "failures"
## [16] "schoolsup" "famsup" "paid" "activities" "nursery"
## [21] "famrel" "freetime" "goout" "Dalc" "Walc"
## [26] "health" "absences" "G1" "G2" "G3"
## [31] "internet_yes" "higher_yes" "romantic_yes"
# Drop the predictors that the ANOVA found non-significant for G3
encoded_data <- encoded_data %>%
select(-famsize,-Pstatus,-schoolsup,-famsup,-paid,-activities,-nursery)
library(caret)
## Loading required package: lattice
# Fix the RNG so the train/test split is reproducible
set.seed(123)
# Split the data into 80% training and 20% test, stratified on G3
split <- createDataPartition(encoded_data$G3 , p = 0.8 , list = FALSE)
train_data <- encoded_data[split, ]
test_data <- encoded_data[-split, ]
# Baseline multiple linear regression of G3 on all remaining predictors
model <- lm(G3 ~ ., data = train_data)
# Renamed from `residuals` so the variable does not shadow stats::residuals()
model_resid <- resid(model)
# Histogram of residuals: a roughly symmetric shape around 0 supports the
# normality assumption of linear regression
hist(model_resid)
# Q-Q plot for normality: points close to the reference line indicate
# approximately normal residuals
qqnorm(model_resid)
qqline(model_resid)
# Refit the linear model through caret so its performance is directly
# comparable with the other caret-trained models below
LM_Model <- train(
G3 ~ . ,
data = train_data,
method = "lm"
)
summary(LM_Model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8352 -0.5189 -0.0036 0.6036 5.0036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.762731 1.041534 0.732 0.46433
## schoolMS -0.289032 0.143670 -2.012 0.04480 *
## sexM -0.095076 0.128155 -0.742 0.45852
## age 0.018109 0.052968 0.342 0.73259
## addressU 0.131752 0.142023 0.928 0.35403
## Medu -0.095752 0.081812 -1.170 0.24242
## Fedu 0.059187 0.072831 0.813 0.41681
## Mjobhealth 0.302100 0.290155 1.041 0.29832
## Mjobother -0.198507 0.161067 -1.232 0.21838
## Mjobservices 0.122871 0.196158 0.626 0.53135
## Mjobteacher 0.186508 0.260118 0.717 0.47371
## Fjobhealth -0.500919 0.417590 -1.200 0.23090
## Fjobother -0.450198 0.248414 -1.812 0.07056 .
## Fjobservices -0.572066 0.260882 -2.193 0.02879 *
## Fjobteacher -0.607158 0.363081 -1.672 0.09512 .
## reasonhome -0.157382 0.150565 -1.045 0.29642
## reasonother -0.509493 0.197128 -2.585 0.01004 *
## reasonreputation -0.237993 0.158982 -1.497 0.13505
## guardianmother -0.106169 0.142556 -0.745 0.45678
## guardianother 0.262006 0.280541 0.934 0.35080
## traveltime 0.173910 0.084351 2.062 0.03977 *
## studytime 0.018560 0.074135 0.250 0.80242
## failures -0.340216 0.116323 -2.925 0.00361 **
## famrel -0.050596 0.064879 -0.780 0.43586
## freetime -0.060300 0.059297 -1.017 0.30971
## goout 0.014457 0.057324 0.252 0.80100
## Dalc -0.008894 0.083555 -0.106 0.91527
## Walc -0.033729 0.062925 -0.536 0.59219
## health -0.031120 0.041640 -0.747 0.45520
## absences 0.014339 0.013128 1.092 0.27528
## G1 0.141916 0.044157 3.214 0.00140 **
## G2 0.870382 0.039524 22.021 < 2e-16 ***
## internet_yes 0.071404 0.144487 0.494 0.62140
## higher_yes 0.196222 0.215570 0.910 0.36314
## romantic_yes 0.044044 0.121725 0.362 0.71763
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.271 on 485 degrees of freedom
## Multiple R-squared: 0.8633, Adjusted R-squared: 0.8537
## F-statistic: 90.1 on 34 and 485 DF, p-value: < 2.2e-16
# Evaluate the linear model on the held-out 20% test set
predictions <- predict(LM_Model , newdata = test_data)
actuals <- test_data$G3
# Test-set RMSE, R-squared, and MAE
postResample(pred = predictions, obs = actuals)
## RMSE Rsquared MAE
## 1.1827269 0.8290849 0.7861066
# Plotting Actual vs Predicted
plot(test_data$G3 , predictions,
xlab = "Actual G3",
ylab = "Predicted G3",
main = "Actual vs Predicted Linear Regression",
pch = 19, col = "blue")
abline(0, 1, col = "red") # y = x reference line
# 10-fold cross-validation settings shared by the tuned models below.
# (A stray trailing comma inside the call has been removed.)
train_control <- trainControl(
  method = "cv",
  number = 10
)
# Elastic-net regression (glmnet): caret tunes alpha (mixing) and lambda
# (penalty) by CV, selecting the pair with the lowest RMSE
GLM_Model <- train(
  G3 ~ . ,
  data = train_data,
  method = "glmnet",
  trControl = train_control,
  metric = "RMSE"  # single string; c("RMSE") was unnecessary
)
# CV results for every (alpha, lambda) combination that was tried
print(GLM_Model)
## glmnet
##
## 520 samples
## 25 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 469, 467, 467, 468, 470, 469, ...
## Resampling results across tuning parameters:
##
## alpha lambda RMSE Rsquared MAE
## 0.10 0.006103841 1.263307 0.8661025 0.8416234
## 0.10 0.061038413 1.257684 0.8663480 0.8360885
## 0.10 0.610384126 1.299546 0.8602287 0.8459695
## 0.55 0.006103841 1.261507 0.8663817 0.8402252
## 0.55 0.061038413 1.238777 0.8691543 0.8224813
## 0.55 0.610384126 1.302952 0.8680741 0.8264794
## 1.00 0.006103841 1.259085 0.8666304 0.8381556
## 1.00 0.061038413 1.232570 0.8696969 0.8188343
## 1.00 0.610384126 1.353099 0.8682164 0.8804954
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.06103841.
# Access final results
# Pull the CV result row that matches the best (alpha, lambda) pair
best_model_results <- GLM_Model$results[
GLM_Model$results$alpha == GLM_Model$bestTune$alpha &
GLM_Model$results$lambda == GLM_Model$bestTune$lambda,
]
# Print RMSE, R-squared, and MAE
cat("Best RMSE:", best_model_results$RMSE, "\n")
## Best RMSE: 1.23257
cat("Best R-squared:", best_model_results$Rsquared, "\n")
## Best R-squared: 0.8696969
cat("Best MAE:", best_model_results$MAE, "\n")
## Best MAE: 0.8188343
# Evaluate the tuned glmnet model on the held-out test set
predictions_GLM <- predict(GLM_Model , newdata = test_data)
actuals_GLM <- test_data$G3
postResample(pred = predictions_GLM, obs = actuals_GLM)
## RMSE Rsquared MAE
## 1.1346729 0.8397974 0.7292078
# Plotting Actual vs Predicted
plot(test_data$G3 , predictions_GLM,
xlab = "Actual G3",
ylab = "Predicted G3",
main = "Actual vs Predicted GLMNET",
pch = 19, col = "blue")
abline(0, 1, col = "red") # y = x reference line
library(e1071)
library(quantmod) # NOTE(review): quantmod does not appear to be used below - confirm it is needed
library(kernlab)
# SVM with an RBF kernel; predictors centred/scaled, C tuned over 5 values
svm_model <- train(G3 ~ . ,
data = train_data,
method = "svmRadial",
trControl = train_control,
preProcess = c("center","scale"),
tuneLength = 5)
# SVM with a linear kernel for comparison (its C stays at the default of 1)
svm_linear <- train(G3 ~ . ,
data = train_data,
method = "svmLinear",
trControl = train_control,
preProcess = c("center","scale"),
tuneLength = 5)
# CV results for the RBF-kernel SVM
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel
##
## 520 samples
## 25 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 468, 467, 468, 468, 468, 469, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.735138 0.7742168 1.0406233
## 0.50 1.626734 0.7887121 0.9768147
## 1.00 1.544052 0.8006845 0.9536633
## 2.00 1.500242 0.8040384 0.9683193
## 4.00 1.508932 0.7968974 1.0049898
##
## Tuning parameter 'sigma' was held constant at a value of 0.01765752
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01765752 and C = 2.
# Best model: the linear kernel has the lower cross-validated RMSE
print(svm_linear)
## Support Vector Machines with Linear Kernel
##
## 520 samples
## 25 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 467, 466, 469, 467, 469, 469, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.263597 0.8525412 0.7958404
##
## Tuning parameter 'C' was held constant at a value of 1
# Predict on test data with the (better) linear-kernel SVM
pred_svm <- predict(svm_linear, newdata = test_data)
actual_svm <- test_data$G3
# Test-set RMSE, R-squared, and MAE
postResample(pred = pred_svm, obs = actual_svm)
## RMSE Rsquared MAE
## 1.1124462 0.8444150 0.7324652
# Plotting Actual vs Predicted
plot(test_data$G3 , pred_svm,
xlab = "Actual Point",
ylab = "Predicted Point",  # fixed label typo: was "Predicited Point"
main = "Actual vs Predicted SVM",
pch = 19, col = "blue")
abline(0, 1, col = "red")
# k-nearest-neighbours regression; predictors centred/scaled
knn_model <- train(
G3~ .,
data = train_data,
method = "knn",
trControl = train_control,
preProcess = c("center","scale"),
tuneLength = 10 # tries 10 values of k (the CV output below shows k = 5, 7, ..., 23)
)
print(knn_model)
## k-Nearest Neighbors
##
## 520 samples
## 25 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 467, 466, 468, 468, 470, 470, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 2.301127 0.5577571 1.645487
## 7 2.297209 0.5767190 1.625549
## 9 2.300054 0.5894665 1.610932
## 11 2.294154 0.5963439 1.617475
## 13 2.309007 0.5971723 1.627464
## 15 2.316585 0.6107240 1.627697
## 17 2.323524 0.6164017 1.630080
## 19 2.308500 0.6339417 1.622317
## 21 2.317659 0.6393866 1.633124
## 23 2.307599 0.6533244 1.629211
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
# Plot CV RMSE as a function of k
plot(knn_model)
# prediction
pred_knn <- predict(knn_model, newdata = test_data)
actual_knn <- test_data$G3
postResample(pred = pred_knn, obs = actual_knn)
## RMSE Rsquared MAE
## 2.0234546 0.4931356 1.4763918
# Plotting Actual vs Predicted Point
plot(actual_knn , pred_knn,
xlab = "Actual Point",
ylab = "Predicted Point",
main = "Actual vs Predicted KNN",
pch = 19, col = "blue")
abline(0, 1, col = "red")
library(rpart)
library(rpart.plot)
library(caret)
# CART decision tree; complexity parameter cp tuned over 10 values.
# NOTE(review): centring/scaling generally does not affect CART splits -
# the preProcess step here is likely unnecessary; confirm.
DT_model <- train(G3 ~ .,
data = train_data,
method = "rpart",
trControl = train_control,
preProcess = c("center","scale"),
tuneLength = 10)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
print(DT_model)
## CART
##
## 520 samples
## 25 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 467, 468, 469, 468, 468, 469, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.002726407 1.410894 0.8203079 0.9045836
## 0.003648967 1.440506 0.8129849 0.9198853
## 0.005194941 1.436188 0.8133545 0.9115749
## 0.013105992 1.464972 0.8057678 0.9369913
## 0.016933022 1.514198 0.7924949 1.0073416
## 0.019677337 1.531827 0.7885625 1.0279373
## 0.054730025 1.763985 0.7337188 1.1847924
## 0.087782927 1.908757 0.6795113 1.3507883
## 0.146849557 2.221369 0.5691557 1.5603428
## 0.512011506 2.722852 0.4918340 1.9699223
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.002726407.
# Visualise the fitted tree
rpart.plot(DT_model$finalModel)
# prediction
pred_DT <- predict(DT_model, newdata = test_data)
actual_DT <- test_data$G3
postResample(pred = pred_DT, obs = actual_DT)
## RMSE Rsquared MAE
## 1.1715342 0.8280618 0.6960317
# Plotting Actual vs Predicted
plot(actual_DT , pred_DT,
xlab = "Actual Point",
ylab = "Predicted Point",
main = "Actual vs Predicted DT",
pch = 19, col = "blue")
abline(0, 1, col = "red")
library(randomForest)
# Random forest; mtry (variables tried per split) tuned over 5 values.
# NOTE(review): centring/scaling likely has no effect on tree-based
# models - confirm whether the preProcess step is needed.
RF_model <- train(G3~ . ,
data = train_data,
method = "rf",
trControl = train_control,
preProcess = c("center","scale"),
tuneLength = 5)
print(RF_model)
## Random Forest
##
## 520 samples
## 25 predictor
##
## Pre-processing: centered (34), scaled (34)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 469, 468, 469, 468, 468, 468, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 2.012521 0.7373867 1.3318100
## 10 1.338777 0.8553028 0.8641057
## 18 1.296631 0.8611280 0.8411014
## 26 1.293885 0.8596675 0.8463808
## 34 1.319895 0.8517951 0.8586288
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 26.
# Plot CV RMSE across the tried mtry values
plot(RF_model)
# prediction
pred_RF <- predict(RF_model , newdata = test_data)
actual_RF <- test_data$G3
postResample(pred = pred_RF , obs = actual_RF)
## RMSE Rsquared MAE
## 1.1549628 0.8341529 0.7232408
# Plotting Actual vs Predicted
plot(actual_RF , pred_RF,
xlab = "Actual Point",
ylab = "Predicted Point",
main = "Actual vs Predicted RF",
pch = 19, col = "blue")
abline(0, 1, col = "red")
# Gradient-boosted trees (xgboost); tuneLength = 5 expands a grid over
# nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample
XGB_model <- train(
G3 ~ . ,
data = train_data,
method = "xgbTree",
trControl = train_control,
preProcess = c("center","scale"),
tuneLength = 5
)
# Extract the best training performance
# (match the CV results row against every tuning parameter in bestTune)
best_metrics <- XGB_model$results[
XGB_model$results$nrounds == XGB_model$bestTune$nrounds &
XGB_model$results$max_depth == XGB_model$bestTune$max_depth &
XGB_model$results$eta == XGB_model$bestTune$eta &
XGB_model$results$gamma == XGB_model$bestTune$gamma &
XGB_model$results$colsample_bytree == XGB_model$bestTune$colsample_bytree &
XGB_model$results$min_child_weight == XGB_model$bestTune$min_child_weight &
XGB_model$results$subsample == XGB_model$bestTune$subsample,
]
print(best_metrics)
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 41 0.3 1 0 0.8 1 0.875 50
## RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 41 1.320218 0.846094 0.8551095 0.4410115 0.07151767 0.158168
# View the best tuning parameters
XGB_model$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 41 50 1 0.3 0 0.8 1 0.875
# Plot CV performance across the tuning grid
plot(XGB_model)
# prediction
pred_XGB <- predict(XGB_model , newdata = test_data)
actual_XGB <- test_data$G3
postResample(pred = pred_XGB , obs = actual_XGB)
## RMSE Rsquared MAE
## 1.1975426 0.8209977 0.7572757